TL;DR

Applying our zero-inflated model to the cortical data (all layets).

Unnormalized data

We select only the cells that pass QC, color coded by the layer of origin. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it will work only on Davide’s system (for now).

data("cortical")

glia0 <-c("DSJN001_N710_S505", "DSJN001_N702_S510", "DSJN003_N724_S521", "DSJN003_N721_S522", "DSJN003_N726_S522", "DSJN003_N728_S518", "DSJN003_N727_S517", "DSJN003_N728_S520", "DSJN003_N728_S515", "DSJN003_N728_S513", "DS07_N703_S506", "DS07_N703_S502", "DS07_N702_S506", "DS07_N701_S505", "DS07_N703_S505", "DS07_N701_S508", "DS07_N702_S505", "DS07_N704_S506", "DS07_N714_S506", "DS07_N715_S505", "DSJN001_N724_S503", "DSJN002_N703_S507", "DSJN002_N702_S502", "DSJN002_N705_S508", "DSJN002_N702_S510", "DSJN002_N711_S502", "DSJN003_N701_S507", "DSJN002_N706_S511", "DSJN002_N715_S502", "DSJN002_N712_S502", "DSJN003_N705_S506", "DSJN003_N704_S507", "DSJN002_N724_S515", "DSJN002_N722_S513", "DSJN002_N720_S521", "DSJN002_N719_S517", "DSJN002_N718_S521", "DSJN002_N728_S513", "DSJN003_N718_S513", "DSJN002_N728_S516", "DS08_N719_S503", "DS08_N723_S507")

# exclude glia and bulk samples
neurons <- cortical[,which(!(colnames(cortical) %in% glia0 | colData(cortical)$MD_single_bulk_pool == "B"))]

# removing ERCC spike ins 
neurons <- neurons[grep("^ERCC-", rownames(neurons), invert = TRUE),]

# removing low quality samples
sf <- metric_sample_filter(assay(neurons), nreads=colData(neurons)$NREADS, ralign=colData(neurons)$RALIGN, zcut=2)

keep <- which(!(sf$filtered_nreads | sf$filtered_ralign))
# plot(colData(neurons)$NREADS, colData(neurons)$RALIGN)
# points(colData(neurons)$NREADS[which((sf$filtered_nreads | sf$filtered_ralign))], colData(neurons)$RALIGN[which((sf$filtered_nreads | sf$filtered_ralign))], col=2, pch=20)

filtered <- neurons[, keep]

filter <- apply(assay(filtered)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 8721

To speed up the computations, I will focus on the top 1,000 most variable genes.

filtered <- filtered[filter,]
core <- assay(filtered)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

cols <- brewer.pal(9, "Set1")
colMerged <- cols[colData(filtered)$MD_cell_type]

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        PC1          PC2 detection_rate    coverage
## PC1             1.00000000 -0.054470256     0.84600973 0.922858873
## PC2            -0.05447026  1.000000000     0.02734315 0.002126654
## detection_rate  0.84600973  0.027343152     1.00000000 0.625824488
## coverage        0.92285887  0.002126654     0.62582449 1.000000000

Now, let’s see how ZINB works.

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 227.633  59.520 127.503
plot(zinb_Vall@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                           W1            W2    gamma_mu   gamma_pi
## W1              1.0000000000 -0.0004612369  0.70920538 -0.6747868
## W2             -0.0004612369  1.0000000000  0.08442253  0.1068222
## gamma_mu        0.7092053768  0.0844225267  1.00000000 -0.4496961
## gamma_pi       -0.6747868376  0.1068222474 -0.44969613  1.0000000
## detection_rate  0.7191271943 -0.0746530992  0.55727837 -0.9848206
## coverage        0.7223596417  0.0477250495  0.98785251 -0.5252069
##                detection_rate    coverage
## W1                  0.7191272  0.72235964
## W2                 -0.0746531  0.04772505
## gamma_mu            0.5572784  0.98785251
## gamma_pi           -0.9848206 -0.52520693
## detection_rate      1.0000000  0.62582449
## coverage            0.6258245  1.00000000

Accounting for batch effects

batch <- droplevels(colData(filtered)$MD_c1_run_id)
bio <- droplevels(colData(filtered)$MD_cell_type)

mod <- model.matrix(~batch)

colBatch <- tapply(colMerged, batch, function(x) x[1])

boxplot(df$W1~batch, col=colBatch)

boxplot(df$W2~batch, col=colBatch)

print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2, X=mod)))
##    user  system elapsed 
## 807.717 346.243 435.708
plot(zinb_X@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

Not surprisingly, removing the batch effects also removes the biology, because of confounded design.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2   gamma_mu   gamma_pi
## W1              1.00000000 -0.06642216  0.5257920 -0.7578291
## W2             -0.06642216  1.00000000 -0.5618436  0.3444663
## gamma_mu        0.52579195 -0.56184363  1.0000000 -0.5913995
## gamma_pi       -0.75782915  0.34446628 -0.5913995  1.0000000
## detection_rate  0.75461410 -0.36227768  0.6667068 -0.9768876
## coverage        0.47169513 -0.58863094  0.9836926 -0.5541336
##                detection_rate   coverage
## W1                  0.7546141  0.4716951
## W2                 -0.3622777 -0.5886309
## gamma_mu            0.6667068  0.9836926
## gamma_pi           -0.9768876 -0.5541336
## detection_rate      1.0000000  0.6258245
## coverage            0.6258245  1.0000000
boxplot(df$W1~batch, col=colBatch)

boxplot(df$W2~batch, col=colBatch)

Accounting for quality

qc <- as.matrix(colData(filtered)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~qcpca$x[,1:5])

plot(df$W1~qcpca$x[,1], pch=19, col=colMerged)

plot(df$W2~qcpca$x[,1], pch=19, col=colMerged)

print(system.time(zinb_q <- zinbFit(core, ncores = 3, K = 2, X=mod)))
##    user  system elapsed 
## 376.159  81.445 175.908
plot(zinb_q@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_q@W[,1], W2=zinb_q@W[,2], gamma_mu = zinb_q@gamma_mu[1,], gamma_pi = zinb_q@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                          W1          W2    gamma_mu   gamma_pi
## W1              1.000000000  0.05303947 -0.08038000  0.1263505
## W2              0.053039473  1.00000000 -0.04453885 -0.1485065
## gamma_mu       -0.080379998 -0.04453885  1.00000000 -0.5122902
## gamma_pi        0.126350534 -0.14850652 -0.51229019  1.0000000
## detection_rate -0.131804715  0.15596995  0.60868408 -0.9848994
## coverage       -0.008229641 -0.11917482  0.98324333 -0.5394186
##                detection_rate     coverage
## W1                 -0.1318047 -0.008229641
## W2                  0.1559700 -0.119174817
## gamma_mu            0.6086841  0.983243329
## gamma_pi           -0.9848994 -0.539418610
## detection_rate      1.0000000  0.625824488
## coverage            0.6258245  1.000000000
boxplot(df$W1~batch, col=colBatch)

boxplot(df$W2~batch, col=colBatch)

Better, but W may still be influenced by batch effects.

Three dimensions

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 414.668  84.572 192.097
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2          W3    gamma_mu    gamma_pi
## W1              1.00000000 -0.01732276  0.01513701 -0.01169311  0.41084352
## W2             -0.01732276  1.00000000  0.03032168 -0.06768404  0.03405243
## W3              0.01513701  0.03032168  1.00000000 -0.01524494  0.13486914
## gamma_mu       -0.01169311 -0.06768404 -0.01524494  1.00000000 -0.56670097
## gamma_pi        0.41084352  0.03405243  0.13486914 -0.56670097  1.00000000
## detection_rate -0.41145339 -0.03753188 -0.11795051  0.65777702 -0.98460392
## coverage        0.09691373 -0.02171400 -0.05042026  0.98158891 -0.54548684
##                detection_rate    coverage
## W1                -0.41145339  0.09691373
## W2                -0.03753188 -0.02171400
## W3                -0.11795051 -0.05042026
## gamma_mu           0.65777702  0.98158891
## gamma_pi          -0.98460392 -0.54548684
## detection_rate     1.00000000  0.62582449
## coverage           0.62582449  1.00000000
boxplot(df$W1~batch, col=colBatch)

boxplot(df$W2~batch, col=colBatch)

boxplot(df$W3~batch, col=colBatch)

Can W capture sample quality?

# mod <- make_design(bio, batch, W=NULL, nested=TRUE)
mod <- model.matrix(~bio)
print(system.time(zinb_bio <- zinbFit(core, ncores = 3, K = 2, X=mod)))
##    user  system elapsed 
## 276.304  70.685 135.794
plot(zinb_bio@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

quality <- qcpca$x[,1]
data.frame(W1=zinb_bio@W[,1], W2=zinb_bio@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=quality), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

Comparing beta estimates with and without W

print(system.time(zinb_noW <- zinbFit(core, ncores = 3, K = 0, X=mod)))
##    user  system elapsed 
## 188.594  53.394 103.411

With W (sample quality)

par(mfrow=c(1, 2))
data(cortical_markers)
L4markers <- cortical_markers[cortical_markers[,2] %in% c("L4", "L2/3,L4") ,1]
L5markers <- cortical_markers[cortical_markers[,2] %in% c("L5", "L5b"), 1]
L6markers <- cortical_markers[cortical_markers[,2] %in% c("L6"), 1]

meanL4 <- zinb_bio@beta_mu[1,]
meanL5 <- zinb_bio@beta_mu[1,] + zinb_bio@beta_mu[2,]
meanL6 <- zinb_bio@beta_mu[1,] + zinb_bio@beta_mu[3,]

logFC <- zinb_bio@beta_mu[2,]
mean <- (meanL4+meanL5)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 5 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L5markers], logFC[L5markers], col=4, pch=20)
legend("topright", c("L4 markers", "L5 markers"), pch=20, col=c(2, 4))

logFC <- zinb_bio@beta_mu[3,]
mean <- (meanL4+meanL6)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 6 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L6markers], logFC[L6markers], col=4, pch=20)
legend("topright", c("L4 markers", "L6 markers"), pch=20, col=c(2, 4))

Without W (sample quality)

par(mfrow=c(1, 2))
meanL4 <- zinb_noW@beta_mu[1,]
meanL5 <- zinb_noW@beta_mu[1,] + zinb_noW@beta_mu[2,]
meanL6 <- zinb_noW@beta_mu[1,] + zinb_noW@beta_mu[3,]

logFC <- zinb_noW@beta_mu[2,]
mean <- (meanL4+meanL5)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 5 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L5markers], logFC[L5markers], col=4, pch=20)
legend("topright", c("L4 markers", "L5 markers"), pch=20, col=c(2, 4))

logFC <- zinb_noW@beta_mu[3,]
mean <- (meanL4+meanL6)/2
names(mean) <- names(logFC) <- rownames(core)
plot(mean, logFC, main="Layer 6 vs Layer 4")
points(mean[L4markers], logFC[L4markers], col=2, pch=20)
points(mean[L6markers], logFC[L6markers], col=4, pch=20)
legend("topright", c("L4 markers", "L6 markers"), pch=20, col=c(2, 4))